# load necessary libraries
import os
import requests
import zipfile
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import folium
import json
from IPython.display import display, HTML
from folium.plugins import TimeSliderChoropleth
from shapely.ops import nearest_pointsLoading the data
First, we will load our data from the following sources:
- All motor vehicle accidents in Philadelphia, 2018-2023 (PennDOT)
- Filtered by accidents involving non-motorists
- American Community Survey 5-year estimates (USCB ACS via
census) - Philadelphia road network from
osmnx
# define download paths
base_url = "https://gis.penndot.gov/gishub/crashZip/County/Philadelphia/Philadelphia_"
download_dir = "philly_crash_data"
os.makedirs(download_dir, exist_ok=True)
# create an empty list to store dataframes
crash_data_list = []
# loop through 2018 to 2023
for year in range(2018, 2024):
print(f"Processing data for year {year}...")
zip_filename = f"{download_dir}/Philadelphia_{year}.zip"
csv_filename = f"{download_dir}/CRASH_PHILADELPHIA_{year}.csv"
# download the zip file if it doesn't exist yet
if not os.path.exists(zip_filename):
url = f"{base_url}{year}.zip"
response = requests.get(url)
if response.status_code == 200:
with open(zip_filename, "wb") as file:
file.write(response.content)
print(f"Downloaded: {zip_filename}")
else:
print(f"Failed to download data for year {year}. URL: {url}")
continue
# extract the csv
if not os.path.exists(csv_filename):
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
csv_files = [name for name in zip_ref.namelist() if name.endswith(f"CRASH_PHILADELPHIA_{year}.csv")]
if csv_files:
zip_ref.extract(csv_files[0], download_dir)
print(f"Extracted: {csv_filename}")
else:
print(f"Relevant CSV file not found in {zip_filename}")
continue
# load csv as df
if os.path.exists(csv_filename):
df = pd.read_csv(csv_filename)
df['year'] = year
crash_data_list.append(df)
# combine all years into a single df
if crash_data_list:
all_crashes = pd.concat(crash_data_list, ignore_index=True)
print("Combined all years into a single DataFrame.")
else:
print("No data loaded.")
# delete the zip and csv files
for year in range(2018, 2024):
zip_filename = f"{download_dir}/Philadelphia_{year}.zip"
csv_filename = f"{download_dir}/CRASH_PHILADELPHIA_{year}.csv"
if os.path.exists(zip_filename):
os.remove(zip_filename)
print(f"Deleted: {zip_filename}")
if os.path.exists(csv_filename):
os.remove(csv_filename)
print(f"Deleted: {csv_filename}")# select for non-motorist crashes only
non_motorist_crashes = all_crashes[all_crashes['NONMOTR_COUNT'] > 0]
# filter out records with missing geography
non_motorist_crashes = non_motorist_crashes.dropna(subset=['DEC_LAT', 'DEC_LONG'])
# convert to gdf using lat and long
geometry = gpd.points_from_xy(non_motorist_crashes['DEC_LONG'], non_motorist_crashes['DEC_LAT'])
non_motorist_gdf = gpd.GeoDataFrame(non_motorist_crashes, geometry=geometry, crs="EPSG:4326")Visualizing the crash data
Crash point data by year
Now that we have imported and prepared the crash data, we will quickly visualize a subset of them (2021-2023, for the sake of Quarto rendering) in the following interactive map.
Years can be toggled on and off to see distributions of crashes across time. Reducing the number of years visualized as well as zooming into the map reduces the relative size of the dots, making it easier to see the spatial distribution of crashes.
# create a base map centered on Philadelphia
m = folium.Map(location=[39.9526, -75.1652], zoom_start=12, tiles="cartodb positron")
# filter out rows with year < 2021
filtered_gdf = non_motorist_gdf[non_motorist_gdf['year'] >= 2021]
# group crashes by year
year_groups = filtered_gdf.groupby('year')
# create a feature group for each year
year_layers = {}
for year, group in year_groups:
year_data = group.copy()
# create geometries from lat and long
year_data['geometry'] = gpd.GeoSeries.from_xy(year_data['DEC_LONG'], year_data['DEC_LAT'])
year_gdf = gpd.GeoDataFrame(year_data, geometry='geometry', crs="EPSG:4326")
# create a feature group for year
year_layer = folium.FeatureGroup(name=str(year))
for _, row in year_gdf.iterrows():
folium.CircleMarker(
location=[row['DEC_LAT'], row['DEC_LONG']],
radius=5,
popup=f"Crash ID: {row['CRN']}<br>Year: {row['year']}",
color='blue',
fill=True,
fill_color='blue',
fill_opacity=0.6
).add_to(year_layer)
# add year layer to dictionary
year_layers[year] = year_layer
# add all layers to map
for year, layer in year_layers.items():
layer.add_to(m)
# add layer control to toggle years with checkboxes
folium.LayerControl(collapsed=False).add_to(m)
# display map
mMake this Notebook Trusted to load map: File -> Trust Notebook
# save non motorist crashes as a geojson file for next notebook
non_motorist_gdf.to_file("non_motorist_gdf.geojson", driver="GeoJSON")